Background

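This notebook serves as a code bank of reusable Plotly chart recipes: box plots, ridgeline plots, bar charts, pie charts, line charts and Sankey diagrams.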

In [57]:
# SAP color palette reference image
from IPython.display import Image

sap_palette_png = '../docs/sap_colors.png'
Image(filename=sap_palette_png)
Out[57]:

Import Libraries

In [76]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import random
# Seed NumPy's global RNG once; the synthetic survey columns generated further down
# draw from it, so their values depend on the cells being run in order.
np.random.seed(1)

import plotly.express as px
# Sample "tips" dataset shipped with plotly express; `df` feeds the box, ridgeline,
# bar and stacked-bar examples.
df = px.data.tips()

import plotly.io as pio
# png_renderer = pio.renderers["png"]
# Default renderer for every fig.show() call in this notebook.
pio.renderers.default = "notebook"
# plotly version noted by the author: plotly==4.2.1

Import Data

In [77]:
# Headline F1 scores, two values each; the same numbers appear in the
# CV-vs-test grouped bar chart (one value per data field).
cv_f1 = [0.94, 0.89]
test_f1 = [0.58, 0.88]

# Each *_cv_f1_yes list below holds 12 F1 scores and is drawn as one box in the
# vertical boxplot example.
# NOTE(review): presumably 2 repeats x 6 folds, given the chart title
# "Repeated 6-fold CV" - confirm.
fasttext_cnn_cv_f1_yes = [0.8254545454545454,
                          0.8493647912885662,
                          0.876611418047882,
                          0.9114391143911438,
                          0.9205776173285197,
                          0.9046728971962616,
                          0.8897058823529411,
                          0.908411214953271,
                          0.8847583643122676,
                          0.9150090415913201,
                          0.8959107806691449,
                          0.8800000000000001]

fasttext_rnn_cv_f1_yes = [0.823747680890538,
                          0.8683274021352313,
                          0.8624535315985131,
                          0.8781362007168458,
                          0.9027522935779816,
                          0.884476534296029,
                          0.8897058823529411,
                          0.8962962962962961,
                          0.8897058823529411,
                          0.9087591240875912,
                          0.8929889298892989,
                          0.8929889298892989]

flair_cnn_cv_f1_yes = [0.823747680890538,
                       0.8734402852049911,
                       0.8390596745027126,
                       0.866785079928952,
                       0.896551724137931,
                       0.8782287822878228,
                       0.8530465949820789,
                       0.8940754039497307,
                       0.8856624319419237,
                       0.901669758812616,
                       0.8876611418047881,
                       0.88]

elmo_small_cv_f1_yes = [0.7943262411347518,
                        0.8729874776386405,
                        0.8655616942909761,
                        0.8954128440366974,
                        0.9199999999999999,
                        0.8909090909090909,
                        0.8913443830570903,
                        0.9114391143911438,
                        0.8947368421052632,
                        0.9038112522686025,
                        0.9117647058823529,
                        0.8917910447761194]

bert_cv_f1_yes = [0.8318264014466548,
                  0.7951807228915663,
                  0.8698884758364313,
                  0.867992766726944,
                  0.9068541300527241,
                  0.874296435272045,
                  0.8908765652951699,
                  0.9174311926605504,
                  0.9018181818181819,
                  0.9227941176470589,
                  0.90625,
                  0.8991150442477875]

# 12 test-set F1 scores; not referenced by any chart in the visible cells.
final_extension_policy_test_f1_yes = [0.6923076923076923,
                                      0.9600000000000001,
                                      0.9230769230769231,
                                      1.0,
                                      0.8695652173913043,
                                      0.9565217391304348,
                                      0.9230769230769231,
                                      1.0,
                                      1.0,
                                      0.9600000000000001,
                                      1.0,
                                      1.0]
In [78]:
# Input for the Sankey diagram: outer key = left-hand node, inner key = right-hand
# node, value = link weight. The Sankey cell titles the two sides "Document Type in
# Filename" (left) and "Actual Document Type" (right) and formats values as "docs".
# Every inner dict lists the same 12 keys in the same order.
# NOTE(review): 'Mastercontract' and 'MasterContract' are distinct keys and are drawn
# as two separate nodes - confirm whether they should be merged.
first_type_dict = {'ContractDocuments': {'orderform': 125,
                                         'trialorderform': 10,
                                         'addendum': 7,
                                         'amendment': 36,
                                         'addorderform': 7,
                                         'appendix': 1,
                                         'termination': 2,
                                         'purchaseorder': 6,
                                         'renewalorderform': 1,
                                         'others': 1,
                                         'supplementaltnc': 0,
                                         'changerequest': 1},
                   'Orderform': {'orderform': 90,
                                 'trialorderform': 7,
                                 'addendum': 3,
                                 'amendment': 21,
                                 'addorderform': 22,
                                 'appendix': 0,
                                 'termination': 2,
                                 'purchaseorder': 28,
                                 'renewalorderform': 2,
                                 'others': 0,
                                 'supplementaltnc': 1,
                                 'changerequest': 0},
                   'Mastercontract': {'orderform': 63,
                                      'trialorderform': 1,
                                      'addendum': 1,
                                      'amendment': 0,
                                      'addorderform': 18,
                                      'appendix': 0,
                                      'termination': 0,
                                      'purchaseorder': 0,
                                      'renewalorderform': 0,
                                      'others': 2,
                                      'supplementaltnc': 0,
                                      'changerequest': 0},
                   'Terms&Conditions(allformats)': {'orderform': 0,
                                                    'trialorderform': 0,
                                                    'addendum': 0,
                                                    'amendment': 0,
                                                    'addorderform': 0,
                                                    'appendix': 0,
                                                    'termination': 0,
                                                    'purchaseorder': 0,
                                                    'renewalorderform': 0,
                                                    'others': 1,
                                                    'supplementaltnc': 0,
                                                    'changerequest': 0},
                   'MasterContract': {'orderform': 4,
                                      'trialorderform': 0,
                                      'addendum': 0,
                                      'amendment': 0,
                                      'addorderform': 0,
                                      'appendix': 0,
                                      'termination': 0,
                                      'purchaseorder': 0,
                                      'renewalorderform': 0,
                                      'others': 0,
                                      'supplementaltnc': 0,
                                      'changerequest': 0},
                   'Amendments': {'orderform': 0,
                                  'trialorderform': 0,
                                  'addendum': 0,
                                  'amendment': 1,
                                  'addorderform': 0,
                                  'appendix': 0,
                                  'termination': 0,
                                  'purchaseorder': 0,
                                  'renewalorderform': 0,
                                  'others': 0,
                                  'supplementaltnc': 0,
                                  'changerequest': 0}}
In [79]:
# Preview the first rows of the tips dataset loaded in the import cell.
df.head()
Out[79]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

Boxplots

Horizontal Boxplots

In [80]:
# Horizontal box plots of tip amount, one box per (sex, smoker) combination.
# Each entry is (sex, value of the `smoker` column, marker color). The trace name is
# derived from the same `smoker` value that drives the filter, so the label and the
# data cannot drift apart (smoker == 'No' -> "non-smoker").
box_groups = [
    ('Female', 'No', '#ffaf00'),
    ('Female', 'Yes', '#e35500'),
    ('Male', 'No', '#cccccc'),
    ('Male', 'Yes', '#999999'),
]
smoker_label = {'Yes': 'smoker', 'No': 'non-smoker'}

fig = go.Figure()
for sex, smoker, color in box_groups:
    fil = (df['sex'] == sex) & (df['smoker'] == smoker)
    fig.add_trace(go.Box(
        x=df.loc[fil, 'tip'].values,
        name='{} ({})'.format(sex, smoker_label[smoker]),
        marker=dict(color=color),
        orientation='h',
        boxmean=True,     # draw the mean only; use boxmean='sd' to add the standard deviation
        boxpoints='all',  # show every observation next to the box
        jitter=0.3,       # horizontal spread of the sample points
        pointpos=-1.8,    # offset of the sample points relative to the box
    ))

fig.update_layout(
    title='Tip Amount (Female vs. Male) <b><br>Among smokers and non-smokers<br></b>',
    xaxis=dict(
        title='Tip',
        range=[-1, 11],
    ),
    margin=dict(l=60, r=30, b=80, t=100),
    autosize=False,
    width=800,
    height=500,
    showlegend=False,  # the category axis already names every box
    template='plotly_white',
)
fig.show(renderer="notebook")

Vertical Boxplots

In [81]:
# One vertical box per modelling approach: (trace name, F1 scores, marker color).
cv_f1_by_model = [
    ('fastText + cnn', fasttext_cnn_cv_f1_yes, '#4ac6b7'),
    ('fastText + rnn', fasttext_rnn_cv_f1_yes, '#4f5e7f'),
    ('flair + cnn', flair_cnn_cv_f1_yes, '#965f8a'),
    ('elmo + cnn', elmo_small_cv_f1_yes, '#ff7070'),
    ('bert + cnn', bert_cv_f1_yes, '#f8aa27'),
]

# The scores double as hover text for the individual points.
data = [
    go.Box(y=scores, name=model_name, text=scores, marker=dict(color=hex_color))
    for model_name, scores, hex_color in cv_f1_by_model
]

fig = go.Figure(data=data)
fig.update_layout(
    title='F1-Yes by Modelling Approach<b><br>Repeated 6-fold CV</b>',
    xaxis=dict(title='Models'),
    yaxis=dict(title='CV F1-Yes'),
    margin=dict(l=60, r=30, b=80, t=100),
    autosize=False,
    width=800,
    height=500,
    showlegend=False,
    template='plotly_white',
)
fig.show()

Ridgeline Plot

In [82]:
# Ridgeline plot: one half-violin of tip amount per (sex, smoker) combination.
# Each entry is (sex, value of the `smoker` column, line color). The trace name is
# built from the same `smoker` value used in the filter, so a "smoker" ridge really
# contains the smoker == 'Yes' rows.
ridge_groups = [
    ('Female', 'No', '#ffaf00'),
    ('Female', 'Yes', '#e35500'),
    ('Male', 'No', '#cccccc'),
    ('Male', 'Yes', '#999999'),
]
smoker_label = {'Yes': 'smoker', 'No': 'non-smoker'}

fig = go.Figure()
for sex, smoker, color in ridge_groups:
    fil = (df['sex'] == sex) & (df['smoker'] == smoker)
    fig.add_trace(go.Violin(
        x=df.loc[fil, 'tip'].values,
        line_color=color,
        name='{} ({})'.format(sex, smoker_label[smoker]),
    ))

fig.update_traces(
    orientation='h',
    side='positive',        # draw only the upper half of each violin
    width=3,                # let neighbouring ridges overlap
    points=False,           # set to 'all' to overlay the raw observations
    jitter=0.05,            # only takes effect when points are shown
    meanline_visible=True,
    scalemode='count',      # scale each violin's area with its sample count
)
fig.update_layout(
    title='Tip Amount (Female vs. Male) <b><br>Among smokers and non-smokers<br></b>',
    xaxis=dict(title='Tip'),
    margin=dict(l=60, r=30, b=80, t=100),
    autosize=False,
    width=800,
    height=500,
    showlegend=True,
    template='plotly_white',
)
fig.show()

Bar Chart

Sort Order of Bars

The 'categoryorder' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['trace', 'category ascending', 'category descending',
            'array', 'total ascending', 'total descending', 'min
            ascending', 'min descending', 'max ascending', 'max
            descending', 'sum ascending', 'sum descending', 'mean
            ascending', 'mean descending', 'median ascending', 'median
            descending']
In [83]:
# Total tip per day: index = day, values = summed tips.
tips_per_day = df.groupby('day')['tip'].sum()

fig = go.Figure(data=[
    go.Bar(
        x=tips_per_day.index,
        y=tips_per_day.values,
        text=tips_per_day.values,
        texttemplate='%{text:.2f}',
        textposition='outside',
        name='Primary Product',
        marker_color='#eb7300',
    ),
])

fig.update_layout(
    barmode='group',
    title='Total Tips Received <b><br>Daily</b>',
    xaxis=dict(
        tickangle=-45,                     # rotate the day labels
        categoryorder='total descending',  # tallest bar first
    ),
    yaxis=dict(
        showticklabels=False,
        title='Tips Amount',
    ),
    margin=dict(l=60, r=30, b=80, t=80),
    autosize=False,
    width=800,
    height=500,
    showlegend=False,
    template='plotly_white',
)
fig.show()

Horizontal Bars

In [84]:
# Summed tips per day, reused for bar length, category axis and bar label.
daily_tip_totals = df.groupby('day')['tip'].sum()

horizontal_bar = go.Bar(
    x=daily_tip_totals.values,
    y=daily_tip_totals.index,
    text=daily_tip_totals.values,
    texttemplate='%{text:.2f}',
    textposition='outside',
    name='Primary Product',
    marker_color='#eb7300',
    orientation='h',
)

fig = go.Figure(data=[horizontal_bar])
fig.update_layout(
    barmode='stack',
    title='Total Tips Received <b><br>Daily</b>',
    xaxis=dict(title='Tips Amount'),
    yaxis=dict(
        showticklabels=True,
        categoryorder='total descending',
    ),
    margin=dict(l=60, r=30, b=80, t=80),
    autosize=False,
    width=800,
    height=500,
    showlegend=False,
    template='plotly_white',
)
fig.show()

Custom bar and line color

In [85]:
# Grouped bars comparing CV and test F1 per data field, with custom fill and
# outline colors.
labels = ['Extension Policy', 'Analyses']
names = ['CV F1-Yes', 'Test F1-Yes']

# (scores, fill color) per series, in the same order as `names`. The scores come from
# cv_f1 / test_f1 in the data cell instead of being retyped here.
series = [
    (cv_f1, '#f9a828'),
    (test_f1, '#07617d'),
]

bars = []
for series_name, (scores, fill_color) in zip(names, series):
    bars.append(go.Bar(
        x=labels,
        y=scores,
        text=scores,
        texttemplate='%{text:.2f}',
        textposition='outside',
        name=series_name,
        marker=dict(  # customize bar fill and outline
            color=fill_color,
            line=dict(color='rgb(8,48,107)', width=1),
        ),
        opacity=0.6,
    ))

fig = go.Figure(data=bars)
fig.update_layout(
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates
    bargroupgap=0.1,  # gap between bars of the same location coordinate
    title='F1-Yes <br><b>5-fold CV VS. Test</b>',
    xaxis=dict(title='Data Fields'),
    # No categoryorder here: the y axis is numeric, and that setting only affects
    # category axes.
    yaxis=dict(
        title='f1-score (Yes)',
        showticklabels=True,
    ),
    margin=dict(l=60, r=30, b=80, t=80),
    autosize=False,
    width=800,
    height=500,
    showlegend=True,  # two colored series need a legend to be told apart
    template='plotly_white',
)
fig.show()

Stacked Bars

In [86]:
# Synthetic survey answers (not real data): each statement adds a column to `df` in
# place, drawing one np.random.rand() value per row from NumPy's global RNG.
# A row gets 'Yes' when its draw exceeds the threshold, so `> .3` yields 'Yes' for
# roughly 70% of rows, `> .9` for roughly 10%, and so on.
# NOTE: the values depend on the RNG state, so re-running this cell (or running cells
# out of order) produces different columns.
df['is_married'] = df.apply(lambda x: 'Yes' if np.random.rand() > .3 else 'No', axis=1)
df['is_employed'] = df.apply(lambda x: 'Yes' if np.random.rand() > .1 else 'No', axis=1)
df['is_senior'] = df.apply(lambda x: 'Yes' if np.random.rand() > .9 else 'No', axis=1)
df['is_child'] = df.apply(lambda x: 'Yes' if np.random.rand() > .9 else 'No', axis=1)
df['is_regular'] = df.apply(lambda x: 'Yes' if np.random.rand() > .5 else 'No', axis=1)
df['is_member'] = df.apply(lambda x: 'Yes' if np.random.rand() > .8 else 'No', axis=1)
df['is_big_spender'] = df.apply(lambda x: 'Yes' if np.random.rand() > .7 else 'No', axis=1)
df['is_lives_near'] = df.apply(lambda x: 'Yes' if np.random.rand() > .8 else 'No', axis=1)
# Replace roughly 30% of the is_lives_near answers with 'Unknown', giving the stacked
# bar example a third category to stack.
df['is_lives_near'] = df['is_lives_near'].apply(lambda x: x if np.random.rand() > .3 else 'Unknown')
In [87]:
# Answer columns summarised in the stacked bar chart, in display order.
cols_list = [
    'smoker',
    'is_married',
    'is_employed',
    'is_senior',
    'is_child',
    'is_regular',
    'is_member',
    'is_big_spender',
    'is_lives_near',
]
In [88]:
# One stacked segment per answer value: (answer as stored in df, fill color).
answer_colors = [
    ('No', '#3ec1d3'),
    ('Yes', '#f6f7d7'),
    ('Unknown', '#ff9a00'),
]
total_rows = len(df)

data = []
for answer, fill_color in answer_colors:
    # Number of rows giving this answer, for every column in cols_list.
    counts = [int((df[col] == answer).sum()) for col in cols_list]
    # Bar label: share of all rows, as a percentage rounded to two decimals.
    shares = [str(round(count / total_rows * 100, 2)) + '%' for count in counts]
    data.append(
        go.Bar(
            x=cols_list,
            y=counts,
            text=shares,
            textposition='auto',
            name=answer,
            marker=dict(
                color=fill_color,
                line=dict(color='rgb(8,48,107)', width=1.5),
            ),
            opacity=0.6,
        )
    )

layout = go.Layout(
    barmode='stack',
    title="Customers' Survey Data <br><b>from 01 Jan to 07 Jan</b>",
    xaxis=dict(tickangle=45),
    yaxis=dict(title='count'),
    margin=dict(l=60, r=30, b=80, t=80),
    autosize=False,
    width=800,
    height=500,
    showlegend=True,
    template='plotly_white',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

Pie Charts

In [89]:
# Donut chart of two threshold buckets, with the second slice pulled out.
colors = ['orange', 'lightblue']

donut = go.Pie(
    labels=['Below Threshold', 'Above Threshold'],
    values=[80, 2821],
    rotation=30,
    hole=.4,            # size of the pie hole
    pull=[0, 0.05],     # pull distance of each slice
    hoverinfo='label+percent',
    textinfo='value+label+percent',
    textposition='outside',
    textfont_size=12,
    marker=dict(
        colors=colors,
        line=dict(color='#000000', width=0.1),
    ),
)

fig = go.Figure(data=[donut])
fig.update_layout(
    title='On-Premise OCR Scores (Threshold 0.81)',
    margin=dict(l=60, r=30, b=80, t=100),
    autosize=False,
    width=800,
    height=500,
    showlegend=True,
    template='plotly_white',
)
fig.show()

Line Chart

In [90]:
# Threshold-analysis table; the first CSV column is used as the index.
automation_rate_csv = '../data/df_automation_rate_ar.csv'
df_automation_rate = pd.read_csv(automation_rate_csv, index_col=[0])
df_automation_rate.head(2)
Out[90]:
threshold automation_rate f1_no f1_yes precision_yes recall_yes
0 0.00 1.000000 0.986039 0.661597 0.679688 0.644444
1 0.01 0.998795 0.986349 0.661479 0.685484 0.639098
In [91]:
# Line chart: one line per metric column, all plotted against the threshold.
threshold_values = df_automation_rate['threshold'].values

# (column in df_automation_rate, legend name)
metric_lines = [
    ('automation_rate', 'Automated Ratio'),
    ('f1_yes', "F1 'Yes'"),
    ('precision_yes', "Precision 'Yes'"),
    ('recall_yes', "Recall 'Yes'"),
]

data = [
    go.Scatter(
        x=threshold_values,
        y=df_automation_rate[column].values,
        mode='lines',
        name=legend_name,
    )
    for column, legend_name in metric_lines
]

# Start from a fresh layout. Every layout property this chart needs is set explicitly
# below, so the cell does not depend on a `layout` variable defined by another cell
# and cannot pick up unrelated settings (tick angle, bar mode) from it.
fig = go.Figure(data=data)

# Both axes hold values in [0, 1] and share the same fixed-range styling.
unit_axis = dict(
    autorange=False,
    fixedrange=True,
    gridcolor="rgb(204, 204, 204)",
    range=[0, 1],
    showline=True,
    showticklabels=True,
    ticks="outside",
)

fig.update_layout(
    title='Threshold Analysis<br><b>EN Cloud Audit Rights</b>',
    xaxis=dict(title='Threshold probability', **unit_axis),
    # NOTE(review): this axis also carries the automation-ratio, precision and recall
    # lines, not only F1 - consider a more general axis title.
    yaxis=dict(title='F1-Score (Yes)', **unit_axis),
    margin=dict(l=60, r=30, b=80, t=80),
    autosize=False,
    width=800,
    height=500,
    showlegend=True,
    legend=dict(  # horizontal legend placed below the plot area
        orientation="h",
        yanchor="bottom",
        y=-0.4,
        xanchor="right",
        x=1,
    ),
    template='plotly_white',
)
fig.show()

Sankey

In [92]:
# Sankey left-hand node names: the outer keys of first_type_dict.
list(first_type_dict)
Out[92]:
['ContractDocuments',
 'Orderform',
 'Mastercontract',
 'Terms&Conditions(allformats)',
 'MasterContract',
 'Amendments']
In [93]:
# Sankey right-hand node names: the inner keys of the first entry of first_type_dict.
first_inner = list(first_type_dict.values())[0]
list(first_inner)
Out[93]:
['orderform',
 'trialorderform',
 'addendum',
 'amendment',
 'addorderform',
 'appendix',
 'termination',
 'purchaseorder',
 'renewalorderform',
 'others',
 'supplementaltnc',
 'changerequest']
In [94]:
# Sankey diagram built from first_type_dict, a nested mapping of
# left node -> {right node -> link weight}.

unique_left = list(first_type_dict.keys())
# Right-hand node names are taken from the first entry's inner keys.
unique_right = [list(v.keys()) for k, v in first_type_dict.items()][0]
labels = unique_left + unique_right  # node index = position in this list

# One link per (left, right) pair. All four lists are built in the same loop so they
# stay aligned, and each weight is looked up by key instead of relying on every inner
# dict listing its keys in the same order.
n_left = len(unique_left)
link_source, link_target, link_value, link_label = [], [], [], []
for left_idx, left_name in enumerate(unique_left):
    counts = first_type_dict[left_name]
    for right_idx, right_name in enumerate(unique_right):
        link_source.append(left_idx)
        link_target.append(n_left + right_idx)  # right nodes follow the left nodes
        link_value.append(counts.get(right_name, 0))
        # Per-link label; the node label list has a different length and must not be
        # reused for links.
        link_label.append('{} -> {}'.format(left_name, right_name))

colors = ['#ffaf00'] * n_left + ["#BCDc50"] * len(unique_right)

fig = go.Figure(data=[go.Sankey(
    valueformat=".0f",
    valuesuffix=" docs",
    # Define nodes
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=labels,
        color=colors,
    ),
    # Add links
    link=dict(
        source=link_source,
        target=link_target,
        value=link_value,
        label=link_label,
        color="rgba(4, 1, 1, 0.15)",
    ),
)])

# NOTE(review): the weights in first_type_dict add up to 464, while the title below
# says 468 documents - confirm which figure is correct.
fig.update_layout(
    title_text="<b>Document Type in Filename (left) VS. Actual Document Type (right)</b><br>Based on 468 manually reviewed documents",
    font_size=10,
    margin=dict(l=60, r=30, b=80, t=100),
    autosize=False,
    width=800,
    height=500,
    showlegend=False,
    template='plotly_white',
)
fig.show()

Export as HTML

In [95]:
# Save the notebook before exporting: nbconvert is given the .ipynb file by name,
# so unsaved changes would not be part of the export. The output below shows it
# writing an .html file.
!jupyter nbconvert plotly_codebank_4.2.1_041120.ipynb --template toc2
[NbConvertApp] Converting notebook plotly_codebank_4.2.1_041120.ipynb to html
[NbConvertApp] Writing 4668362 bytes to plotly_codebank_4.2.1_041120.html
In [ ]: